# -*- coding:utf8 -*-
# !/usr/bin/env python

'''
#全国企业信用信息公示系统（辽宁）
#维护黄羽
'''
import re
from bs4 import BeautifulSoup

import table
import json
import traceback
import copy
import requests

from utils import kill_captcha
from scpy.logger import get_logger
from scpy.xawesome_time import parse_time

import sd_template_dict as TE
import ln_trans_dict as TR
import sd_format as FO

logger = get_logger(__file__)


def trans_time(a_dict, clean_time_list):
    """Return a deep copy of ``a_dict`` with date-string values normalized.

    :param a_dict: source dict (never mutated)
    :param clean_time_list: keys whose truthy values are date strings to be
                            normalized via ``parse_time``
    :return: deep copy of ``a_dict`` with normalized time values
    :raises ValueError: if ``a_dict`` is not a dict
    """
    # Validate before copying: the original deep-copied first, doing wasted
    # work (and potentially raising a confusing error) on non-dict input.
    if not isinstance(a_dict, dict):
        raise ValueError("input is not a dict")
    res_dict = copy.deepcopy(a_dict)
    for k, v in a_dict.items():
        if k in clean_time_list and v:
            res_dict[k] = parse_time(v)
    return res_dict


def trans_money(a_dict, clean_money_list):
    """Return a deep copy of ``a_dict`` with money values normalized to float.

    :param a_dict: source dict (never mutated)
    :param clean_money_list: keys whose truthy values hold money amounts
    :return: deep copy of ``a_dict``; int amounts become float, string
             amounts go through ``table.money_notclean``, floats and the
             "not disclosed" markers ('不公示' / '无') are left untouched
    :raises ValueError: if ``a_dict`` is not a dict, or a value has an
                        unsupported type
    """
    # Validate before copying (the original deep-copied first).
    if not isinstance(a_dict, dict):
        raise ValueError("input is not a dict")
    res_dict = copy.deepcopy(a_dict)
    for k, v in a_dict.items():
        if k not in clean_money_list or not v:
            continue
        if isinstance(v, float):
            continue  # already numeric
        elif isinstance(v, int):
            # BUGFIX: the original evaluated "'不公示' in v" before this type
            # check, so any int value raised TypeError and this branch was
            # unreachable. Dispatch on type first.
            res_dict[k] = float(v)
        elif isinstance(v, basestring):
            # '不公示'/'无' mean "not disclosed" — leave the marker as-is.
            if '不公示' in v or '无' in v:
                continue
            res_dict[k] = table.money_notclean(v)
        else:
            raise ValueError("unsupported money value type: %r" % type(v))
    return res_dict


# Desktop-Chromium User-Agent sent with every request so the site serves the
# normal pages.
ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36"

# Per-request timeout in seconds, applied to all HTTP calls in this module.
TIME_OUT = 30


def download_captcha_kill(companyName):
    """
    Download the captcha image, crack it, then search for the company.

    :param companyName: company name to search for
    :return: the first company record (dict) when the captcha is solved and
             the company exists;
             None when the company does not exist;
             '' when the captcha was downloaded/cracked incorrectly
             (caller should retry).
    :raises Exception: when the cracking service or an HTTP request fails.
    """
    img_url_str = 'http://gsxt.lngs.gov.cn/saicpub/commonsSC/loginDC/securityCode.action?tdate=95283'
    req = requests.session()
    # Headers mimic a browser request coming from the search page.
    req.headers = {
        'Accept': 'image/webp,image/*,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'gsxt.lngs.gov.cn',
        'Referer': 'http://gsxt.lngs.gov.cn/saicpub/entPublicitySC/entPublicityDC/lngsSearchFpc.action',
        'User-Agent': ua,
    }
    img_bin = req.get(img_url_str, timeout=TIME_OUT).content
    if not img_bin:  # empty body means the captcha image failed to download
        return ''
    try:
        res_code = kill_captcha(img_bin, "ln", "jpeg")
    except Exception:
        logger.exception("破解验证码的服务，出现异常")
        raise  # bare raise preserves the original traceback ("raise e" loses it)

    if not res_code or len(res_code) > 100 or str(res_code) in ['None', 'wrong']:
        # BUGFIX: logger.error, not logger.exception — there is no active
        # exception here, so exception() would log a bogus "NoneType" trace.
        logger.error("验证码内容为: \n %s", res_code)
        logger.error("破解验证码的服务，出现异常,可能是下载的验证码错误，也可能破解服务出现异常")
        return ''  # empty string tells the caller to retry the captcha

    logger.info('验证码为:%s' % res_code)

    check_data = {
        'solrCondition': companyName,
        'authCode': res_code,
    }
    check_url = 'http://gsxt.lngs.gov.cn/saicpub/entPublicitySC/entPublicityDC/lngsSearchFpc.action'
    check_res = req.post(check_url, data=check_data, timeout=TIME_OUT).content
    # A wrong captcha either keeps us on the search front page (which still
    # shows the "严重违法企业名单" tab) or lands on the error page ("出错了").
    if re.compile('严重违法企业名单').findall(check_res) or re.compile('出错了').findall(check_res):
        return ''  # retry the captcha

    company_list_json = re.compile('searchList_paging\((.*),.*\);').findall(check_res)

    if not company_list_json:
        return ''
    company_list = json.loads(company_list_json[0])

    if company_list:
        logger.info("搜索到:%s条信息" % len(company_list))
        return company_list[0]
    return None  # the searched company does not exist


def get_company_info(company_pripid):
    """
    Download the company's detail pages and annual reports.

    :param company_pripid: dict for one company as returned by the search
                           page; its 'pripid' and 'enttype' keys identify
                           the company on the detail endpoints.
    :return: raw-content dict with keys "province", "type", "html" (dict of
             raw detail-page bodies), "yearList" (list of
             {"year", "base": raw html}), plus placeholder fields.
    :raises Exception: if company_pripid is falsy.
    """
    if not company_pripid:
        raise Exception("company_pripid 错误")
    # Skeleton of the raw result; "html" and "yearList" are filled below.
    raw_dict = {
        "province": "ln",
        "type": "1",
        "html": "",
        "yearList": [],
        "keyword": "",
        "companyName": "",
        "json": "",
    }
    raw_base_dict = {}
    req = requests.session()
    # Query parameters shared by every detail endpoint.
    company_base_info_dict = {
        'pripid': company_pripid.get('pripid', ''),
        'type': company_pripid.get('enttype', ''),
    }
    root_url = 'http://gsxt.lngs.gov.cn/saicpub/entPublicitySC/entPublicityDC/'
    # Headers mimic the AJAX requests made by the site's detail page.
    req.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'gsxt.lngs.gov.cn',
        'Referer': 'http://gsxt.lngs.gov.cn/saicpub/entPublicitySC/entPublicityDC/sEntDetail.action',
        'X-Requested-With': 'XMLHttpRequest',
    }
    # Basic registration info.
    base_res = req.get(root_url + 'getJbxxAction.action?', params=company_base_info_dict, timeout=TIME_OUT).content
    # base_res_soup = BeautifulSoup(base_res, 'html5lib')
    raw_base_dict["base"] = base_res
    # Shareholder (investor) summary list.
    share_res = req.get(root_url + 'getTzrxxAction.action?', params=company_base_info_dict, timeout=TIME_OUT).content
    raw_base_dict["share"] = share_res

    # Per-shareholder detail pages: the summary page embeds a JS callback
    # whose first argument is a JSON list; each item's "invid" addresses one
    # detail record.
    share_detail = re.compile('tzr_paging\((.*),.*,.*,.*\);').findall(share_res)
    share_detail_list = []
    if share_detail:
        share_url = root_url + 'getGsgsTzrxxPojoList.action?'
        for item in json.loads(share_detail[0]):
            if "invid" in item:
                share_data = {'pripid': company_pripid['pripid'], 'invid': item["invid"], }
                share_detail_res = req.post(share_url, data=share_data, timeout=TIME_OUT).content
                share_detail_list.append(share_detail_res)
    raw_base_dict["share_detail"] = share_detail_list
    # Change-of-registration records.
    alter_res = req.get(root_url + 'getBgxxAction.action?', params=company_base_info_dict, timeout=TIME_OUT).content
    raw_base_dict["alter"] = alter_res

    # Page two: filing information.
    # Key personnel.
    person_res = req.get(root_url + 'getZyryxxAction.action?', params=company_base_info_dict, timeout=TIME_OUT).content
    raw_base_dict["person"] = person_res

    # Branch offices (NOTE(review): original comment marked this "待完善" /
    # to be completed — the endpoint may not return everything).
    branch_res = req.get(root_url + 'getFgsxxAction.action?', params=company_base_info_dict, timeout=TIME_OUT).content
    raw_base_dict["branch"] = branch_res

    # Abnormal-operation records.
    abnormal_res = req.get(root_url + 'getJyycxxAction.action?', params=company_base_info_dict, timeout=TIME_OUT).content
    raw_base_dict["abnormal"] = abnormal_res

    # Spot-check / inspection records.
    check_res = req.get(root_url + 'getCcjcxxAction.action?', params=company_base_info_dict, timeout=TIME_OUT).content
    raw_base_dict["check"] = check_res

    raw_dict["html"] = raw_base_dict

    '''
    年报
    '''
    # Annual reports: the index page lists one JS callback entry per year;
    # each entry's artid + enttype address the full report page.
    year_index_res = req.get(root_url + 'getQygsQynbxxAction.action?', params=company_base_info_dict, timeout=TIME_OUT).content

    qynbPaging = re.compile('qynbPaging\((.*),.*,.*\);').findall(year_index_res)
    qynbPaging_list = json.loads(qynbPaging[0]) if qynbPaging else []
    raw_year_list = []
    for a_qynbPaging in qynbPaging_list:
        artid = a_qynbPaging.get('artid', '')
        enttype = company_pripid.get('enttype', '')  # company type from the search record
        year = a_qynbPaging.get('ancheyear', '')
        if not all([artid, enttype, year]):
            continue  # skip incomplete index entries
        year_report_res = req.get(root_url + 'nbDeatil.action?', params={'artId': artid, 'entType': enttype}, timeout=TIME_OUT).content
        raw_year_list.append({'year': year, 'base': year_report_res})

    raw_dict["yearList"] = raw_year_list

    return raw_dict


def extract_base_info(raw_dict):
    """
    Parse the raw detail pages into the structured base-info dict.

    :param raw_dict: raw-content dict produced by ``get_company_info``
    :return: dict shaped like ``TE.void_base_dict`` with basicList,
             shareHolderList, alterList, personList, filiationList,
             abnormalOperation and checkMessage filled in, cleaned via
             ``FO.clean_all``
    :raises Exception: if raw_dict or raw_dict["html"] is missing/empty
    """
    if not raw_dict:
        raise Exception("raw_dict 错误")

    raw_html = raw_dict.get("html", {})
    if not raw_html:
        raise Exception("raw_dict 错误")

    def _parse_list(key, pattern, template, trans):
        # Every section page embeds a JS callback like `xxx_paging([...], ...)`;
        # pull out the first (JSON) argument and map each item through the
        # template/translation tables.  Deduplicates six identical stanzas
        # from the original implementation.
        found = re.findall(pattern, raw_html.get(key, ""))
        items = json.loads(found[0]) if found else []
        return [FO.transform_dict(template, trans, item) for item in items]

    res_dict = copy.deepcopy(TE.void_base_dict)

    # Basic registration info: an HTML table, not a JS callback.
    raw_base = raw_html.get("base", "")
    base_table = table.table_clean(raw_base, "基本信息")
    res_dict['province'] = 'ln'
    res_dict['basicList'] = table.index("基本信息", base_table)

    # Shareholder summary list.
    res_share_list = _parse_list("share", r"tzr_paging\((.*),.*,.*,.*\);",
                                 TE.shareHolder_dict, TR.shareHolder_dict)

    # Shareholder detail pages (one JSON document per shareholder).
    res_share_list_2 = []
    for a_share_detail in raw_html.get("share_detail", []):
        a_share_detail_list = json.loads(a_share_detail)
        if not a_share_detail_list:
            continue
        a_share_detail_dict = a_share_detail_list[0].get("tRegTzrxx", {})
        if a_share_detail_dict:
            res_item = FO.transform_dict(TE.shareHolder_dict, TR.shareHolder_dict_detail,
                                         a_share_detail_dict)
            # Contribution date lives in a sibling list when present.
            if a_share_detail_list[0].get('tRegTzrrjxxList'):
                res_item['conDate'] = a_share_detail_list[0]['tRegTzrrjxxList'][0].get('condate', '')
            res_share_list_2.append(res_item)
    # Prefer the richer detail records; fall back to the summary list.
    res_dict['shareHolderList'] = res_share_list_2 if res_share_list_2 else res_share_list

    # Remaining sections all follow the same JS-callback pattern.
    res_dict['alterList'] = _parse_list(
        "alter", r'paging\((.*),.*\);', TE.alter_dict, TR.alter_dict)
    res_dict['personList'] = _parse_list(
        "person", r'zyry_nz_paging\((.*),.*\);', TE.person_dict, TR.person_dict)
    res_dict['filiationList'] = _parse_list(
        "branch", r'fzjgPaging\((.*),.*\);', TE.filiation_dict, TR.filiation_dict)
    res_dict['abnormalOperation'] = _parse_list(
        "abnormal", r'jyyc_paging\((.*),.*\);', TE.abnormalOperation_dict, TR.abnormalOperation_dict)
    res_dict['checkMessage'] = _parse_list(
        "check", r'ccjc_paging\((.*),.*\);', TE.checkMessage_dict, TR.checkMessage_dict)

    return FO.clean_all(res_dict)


def extract_year_info(raw_dict):
    """
    Parse the raw annual-report pages into structured year dicts.

    :param raw_dict: raw-content dict produced by ``get_company_info``;
                     its "yearList" holds {"year", "base": raw html} items
    :return: list of dicts shaped like ``TE.void_year_dict``, cleaned via
             ``FO.clean_all``
    :raises Exception: if raw_dict is missing/empty
    """
    if not raw_dict:
        raise Exception("raw_dict 错误")

    def _js_items(raw, pattern):
        # Each report section is embedded as a JS callback whose first
        # argument is a JSON list; return that list (or [] when absent).
        found = re.findall(pattern, raw)
        return json.loads(found[0]) if found else []

    res_year_list = []

    for a_raw_year_item in raw_dict.get("yearList", []):
        res_year_dict = copy.deepcopy(TE.void_year_dict)
        res_year_dict["year"] = a_raw_year_item.get("year", "")
        raw_year_base = a_raw_year_item.get("base", "")

        # Basic report info: two possible HTML-table section titles.
        year_base = table.table_clean(raw_year_base, "企业基本信息") + table.table_clean(raw_year_base, "基本信息")
        res_year_dict['baseInfo'] = table.report_index("企业基本信息", year_base) if year_base else {}

        # Website / web-shop info: only the first entry is kept.
        web_items = _js_items(raw_year_base, r'swPaging\((.*)\);')
        if web_items:
            res_year_dict['website'] = FO.transform_dict(TE.website_dict, TR.website_dict, web_items[0])

        # Shareholders and their contributions.
        res_year_dict['investorInformations'] = [
            FO.transform_dict(TE.investorInformations_dict, TR.investorInformations_dict, item)
            for item in _js_items(raw_year_base, r'czPaging\((.*)\);')]

        # Outbound investments.
        res_year_dict['entinvItemList'] = [
            FO.transform_dict(TE.entinvItem_dict, TR.entinvItem_dict, item)
            for item in _js_items(raw_year_base, r'tzPaging\((.*)\);')]

        # Company asset status (HTML table section).
        year_assets = table.table_clean(raw_year_base, "企业资产状况信息")
        res_year_dict['assetsInfo'] = table.report_index("企业资产状况信息", year_assets) if year_assets else []

        # Equity changes.
        res_year_dict['equityChangeInformations'] = [
            FO.transform_dict(TE.equityChangeInformations_dict, TR.equityChangeInformations_dict, item)
            for item in _js_items(raw_year_base, r'bgPaging\((.*)\);')]

        # Modification records.
        res_year_dict['changeRecords'] = [
            FO.transform_dict(TE.changeRecords_dict, TR.changeRecords_dict, item)
            for item in _js_items(raw_year_base, r'xgPaging\((.*)\);')]

        res_year_list.append(res_year_dict)

    return FO.clean_all(res_year_list)


def search2(companyName, MAXTIME=40):
    """
    Search a company, retrying the captcha up to MAXTIME times, and crawl it.

    :param companyName: company name to search for
    :param MAXTIME: maximum number of captcha-cracking attempts
    :return: (raw_dict, parsed_dict, gate_method) on success;
             None when the company does not exist
    :raises Exception: when the captcha cannot be cracked within MAXTIME
                       attempts, or crawling/extraction fails
    """
    res = ''
    a_time = MAXTIME
    while a_time > 0:
        if res is None:  # the company does not exist
            return None
        elif res == '':  # captcha was wrong -> retry
            if a_time < MAXTIME:
                logger.error("重复破解验证码!当前设定重复破解次数为:%s, 剩余次数为:%s " % (MAXTIME, a_time))
            a_time -= 1
            try:
                res = download_captcha_kill(companyName)
            except Exception:
                traceback.print_exc()  # print_exc takes no exception argument
                raise  # bare raise preserves the original traceback
        else:
            break

    # BUGFIX: handle loop exhaustion BEFORE calling get_company_info.  The
    # original checked `res == ''` only after `res` had been reassigned to
    # the crawl result, so the check was dead code and an exhausted retry
    # budget crashed inside get_company_info with a misleading message.
    if res is None:
        return None
    if res == '':
        raise Exception("多次破解验证码错误,当前设置次数为：%s" % MAXTIME)

    com_list = res
    raw_dict = get_company_info(com_list)
    try:
        asic_dict = extract_base_info(raw_dict)
        year_list = extract_year_info(raw_dict)
        # Prefer the name parsed from the page; fall back to the query.
        company_name = asic_dict['basicList'][0].get('enterpriseName', '') or companyName
        raw_dict['companyName'] = company_name

        asic_dict['yearReportList'] = year_list
        gate_method = {
            'url': 'http://gsxt.lngs.gov.cn/saicpub/entPublicitySC/entPublicityDC/',
            'method': 'get',
            'province': 'ln',
            'companyName': company_name,
            'data': com_list,
        }

        return raw_dict, asic_dict, gate_method

    except Exception:
        traceback.print_exc()
        raise


def search(companyName):
    """Search a company and return only the parsed info dict (or None)."""
    result = search2(companyName)
    return result[1] if result else None


def search3(gate_method):
    """
    Re-crawl a company from a previously saved gate_method dict.

    :param gate_method: dict produced by ``search2``; its 'data' key holds
                        the raw search record used to fetch detail pages
    :return: (raw_dict, parsed_dict, gate_method); parsed_dict is None when
             extraction fails (best-effort fallback keeps the raw html)
    :raises Exception: when gate_method has no 'data' key
    """
    if 'data' not in gate_method:
        raise Exception("gate_method error, doesn't have `data` key")
    com_list = gate_method.get('data')
    res = get_company_info(com_list)
    companyName = gate_method.get('companyName', '')

    raw_dict = res
    try:
        asic_dict = extract_base_info(raw_dict)
        year_list = extract_year_info(raw_dict)
        # Prefer the name parsed from the page; fall back to the saved one.
        company_name = asic_dict['basicList'][0].get('enterpriseName', '') or companyName
        res['companyName'] = company_name

        asic_dict['yearReportList'] = year_list
        gate_method = {
            'url': 'http://gsxt.lngs.gov.cn/saicpub/entPublicitySC/entPublicityDC/',
            'method': 'get',
            'province': 'ln',
            'companyName': company_name,
            'data': com_list,
        }

        return res, asic_dict, gate_method

    except Exception as e:
        # BUGFIX: logger.exception keeps the traceback in the log; the
        # original logger.info(e) silently dropped the stack trace.
        # Extraction failure is tolerated: return the raw html only.
        logger.exception(e)
        res['companyName'] = companyName
        gate_method = {
            'url': 'http://gsxt.lngs.gov.cn/saicpub/entPublicitySC/entPublicityDC/',
            'method': 'get',
            'province': 'ln',
            'companyName': companyName,
            'data': com_list,
        }
        return res, None, gate_method


if __name__ == "__main__":
    # Smoke test: crawl one known Liaoning company and dump the full result.
    target = u'中网联合辽宁科技有限公司'
    result = search2(target)
    # json is already imported at module level; parenthesized print works on
    # both Python 2 and Python 3.
    print(json.dumps(result, indent=4, ensure_ascii=False))
